Added hook to tokenizer and to parser for language-specific processing.
author Jens Frank <jeluf@users.mediawiki.org>
Tue, 2 Mar 2004 20:23:56 +0000 (20:23 +0000)
committer Jens Frank <jeluf@users.mediawiki.org>
Tue, 2 Mar 2004 20:23:56 +0000 (20:23 +0000)

Using this hook, added a conversion of spaces to non-breaking spaces
(before ":", "»", "!" and "?", after "«", and between digits) for the
French Wikipedia.

Switched the ----- -> <hr> processing to the tokenizer.

includes/Parser.php
includes/Tokenizer.php
languages/Language.php
languages/LanguageFr.php

index 2e4e802..e92f6c3 100644 (file)
@@ -360,7 +360,7 @@ class Parser
                $text = $this->removeHTMLtags( $text );
                $text = $this->replaceVariables( $text );
 
-               $text = preg_replace( "/(^|\n)-----*/", "\\1<hr>", $text );
+               $text = preg_replace( "/(^|\n)-----*/", "\\1<hr>", $text );
                $text = str_replace ( "<HR>", "<hr>", $text );
 
                $text = $this->doHeadings( $text );
@@ -542,6 +542,8 @@ class Parser
 
        /* private */ function replaceInternalLinks( $str )
        {
+               global $wgLang; # for language specific parser hook
+
                $tokenizer=Tokenizer::newFromString( $str );
                $tokenStack = array();
                
@@ -596,6 +598,9 @@ class Parser
                                        }
                                        $tagIsOpen = (count( $tokenStack ) != 0);
                                        break;
+                               case "----":
+                                       $txt = "\n<hr>\n";
+                                       break;
                                case "'''":
                                        # This and the three next ones handle quotes
                                        $txt = $this->handle3Quotes( $state, $token );
@@ -611,9 +616,13 @@ class Parser
                                        $txt="";
                                        break;
                                default:
-                                       # An unkown token. Highlight.
-                                       $txt = "<font color=\"#FF0000\"><b>".$token["type"]."</b></font>";
-                                       $txt .= "<font color=\"#FFFF00\"><b>".$token["text"]."</b></font>";
+                                       # Call language specific Hook.
+                                       $txt = $wgLang->processToken( $token, $tokenStack );
+                                       if ( NULL == $txt ) {
+                                               # An unknown token. Highlight.
+                                               $txt = "<font color=\"#FF0000\"><b>".$token["type"]."</b></font>";
+                                               $txt .= "<font color=\"#FFFF00\"><b>".$token["text"]."</b></font>";
+                                       }
                                        break;
                        }
                        # If we're parsing the interior of a link, don't append the interior to $s,
index d7eb080..beeda47 100644 (file)
@@ -26,22 +26,27 @@ class Tokenizer {
        function preParse()
        {
                global $wgLang;
+
+               # build up the regex, step by step.
+               # Basic features: Quotes for <em>/<strong> and hyphens for <hr>
+               $regex = "\'\'\'\'\'|\'\'\'|\'\'|\n-----*";
+               # Append regex for linkPrefixExtension 
                if (  $wgLang->linkPrefixExtension() ) {
-                       $regex = "/(([a-zA-Z\x80-\xff]+)\[\[|\]\]|\'\'\'\'\'|\'\'\'|\'\')/";
-                       #          000000000000000000000000000000000000000000000000000000
-                       #           1111111111111111111111111111111111111111111111111111
-                       #            222222222222222222
-                       # which $this->mMatch[...] will contain the match.
+                       $regex .= "|([a-zA-Z\x80-\xff]+)\[\[";
                } else {
-                       $regex = "/(\[\[|\]\]|\'\'\'\'\'|\'\'\'|\'\')/";
+                       $regex .= "|\[\[";
                }
+               # Closing link
+               $regex .= "|\]\]";
+               # Language-specific additions
+               $regex .= $wgLang->tokenizerRegex();
+               # Finalize regex
+               $regex = "/(" . $regex . ")/";
 
+               # Apply the regex to the text
                $this->mCount = preg_match_all( $regex, $this->mText, $this->mMatch,
                                                PREG_PATTERN_ORDER|PREG_OFFSET_CAPTURE);
                $this->mMatchPos=0;
-               # print( "<pre>" );
-               # print_r( $this->mMatch );
-               # print( "</pre>" );
        }
 
        function nextToken()
@@ -76,6 +81,12 @@ class Tokenizer {
                                        $token["text"] = $this->mMatch[2][$this->mMatchPos][0]; # the prefix
                                } else {
                                        $token["type"] = $this->mMatch[0][$this->mMatchPos][0];
+                                       if ( substr($token["type"],1,4) == "----" )
+                                       {
+                                               # Four or more hyphens following a newline form an <hr>;
+                                               # normalize the token type to exactly four hyphens.
+                                               $token["type"]="----";
+                                       }
                                }
                                # What the pointers would change to if this would not just be a preview
                                $token["mPos"] = $this->mPos + strlen( $this->mMatch[0][$this->mMatchPos][0] );
index 100e4cf..df7e396 100644 (file)
@@ -1732,6 +1732,20 @@ class Language {
        {
                return "<em>$text</em>";
        }
+
+       # returns additional Regex for the tokenizer. See LanguageFr.php for an example
+       function tokenizerRegex()
+       {
+               return "";
+       }
+
+       # Process the token generated from the tokenizer by the above regex. Return
+       # NULL if the token is unknown, and the text to be added to the output otherwise
+       function processToken( &$token , &$tokenStack)
+       {
+               return NULL;
+       }
+
 }
 
 @include_once( "Language" . ucfirst( $wgLanguageCode ) . ".php" );
index 2e1a858..86ee9a6 100644 (file)
@@ -1066,6 +1066,32 @@ class LanguageFr extends Language
                else return $m;
 
        }
+
+       # returns additional Regex for the tokenizer.
+       function tokenizerRegex()
+       {
+               return "| [:»!?]|« |[0-9] [0-9]";
+       }
+
+       # Process the token generated from the tokenizer by the above regex. Return
+       # NULL if the token is unknown, and the text to be added to the output otherwise
+       function processToken( &$token , &$tokenStack)
+       {
+               if ( preg_match( "/ ([:»!?])/", $token["type"], $m ) )
+               {
+                       $txt = "&nbsp;" . $m[1];
+               } elseif ( "« " == $token["type"] )
+               {
+                       $txt = "«&nbsp;";
+               } elseif ( preg_match( "/([0-9]) ([0-9])/", $token["type"], $m ) )
+               {
+                       $txt = $m[1] . "&nbsp;" . $m[2];
+               } else
+               {
+                       $txt = NULL;
+               }
+               return $txt;
+       }
 }
 
 ?>